import pandas as pd
import plotly
import plotly_express as px
plotly.offline.init_notebook_mode()
df = pd.read_csv("casestudy.csv")
revenue_by_year_df = df.groupby("year")["net_revenue"].sum()
years = revenue_by_year_df.index
revenues = revenue_by_year_df.values
fig = px.bar(x=years, y=revenues)
fig.update_layout(title_text="Revenue by year", yaxis_title="Revenue", xaxis_title = "Year", xaxis=dict(tickmode='linear'))
fig.show()
"""
returns a dataframe of year:net_revenue
where net_revenue is the total revenue of the new custoemrs for each eyar
"""
def new_customer_revenue_per_year():
new_customer_revenue = []
emails_by_year = df.groupby('year')['customer_email'].unique()
for year, emails in emails_by_year.iteritems():
new_emails = set(emails) - set(emails_by_year.get(year-1, []))
new_customers_df = df[df['customer_email'].isin(new_emails) & (df['year'] == year)]
new_customer_revenue.append((year, new_customers_df['net_revenue'].sum()))
return pd.DataFrame(new_customer_revenue, columns=['year', 'net_revenue'])
new_customer_revenue_df = new_customer_revenue_per_year()
years = new_customer_revenue_df['year']
revenues = new_customer_revenue_df['net_revenue']
fig = px.bar(x=years[1:], y=revenues[1:])
#fig = px.bar(x=years, y=revenues)
fig.update_layout(title_text="Revenue of new customers by year", yaxis_title="Revenue", xaxis_title = "Year", xaxis=dict(tickmode='linear'))
fig.show()
The visualizations below depict data for all years except the first one, as the case study requested information that required data from previous years.
revenue_temp = 0
growth=[]
for revenue in revenue_by_year_df:
growth.append(revenue-revenue_temp)
revenue_temp = revenue
fig = px.line(x=years[1:], y=growth[1:])
fig.update_layout(title_text="Growth", yaxis_title="Current-Previous Year revenue", xaxis_title = "Year",xaxis=dict(tickmode='linear'))
fig.show()
by_year_df = df.groupby('year')
lost_revenue_years = years[:-1]
lost_revenue = []
# Iterate through each year
for year, year_df in by_year_df:
if year == 2017: break
# Identify the customers who made a purchase in the current year but not in the following year
current_year_customers = set(year_df['customer_email'])
next_year_customers = set(by_year_df.get_group(year + 1)['customer_email'])
lost_customers = current_year_customers - next_year_customers
lost_revenue.append(year_df[year_df['customer_email'].isin(lost_customers)]['net_revenue'].sum())
fig = px.bar(x=lost_revenue_years, y=lost_revenue)
fig.update_layout(title_text="Lost Revenue by Attrition", yaxis_title="Lost revenue", xaxis_title = "Year", xaxis=dict(tickmode='linear'))
fig.show()
# these are what i am looking for
added_revenue_per_year = [] # list of tuples containing the current and prior year revenue for existing customers
total_customers_per_year = [] # list of tuples containing the number of current and prior year customers
new_customers_per_year = [] # list of integers representing the number of new customers for each year
lost_customers_per_year = [] # list of integers representing the number of lost customers for each year
#Iterate through each year
by_year_df = df.groupby('year')
for year, year_df in by_year_df:
# skip first year since there are not existing customers
if year == 2015: continue
# Identify the customers who made a purchase in the current year and in the previous year
current_year_customers = set(year_df['customer_email'])
previous_year_customers = set(by_year_df.get_group(year - 1)['customer_email'])
existing_customers = current_year_customers & previous_year_customers # customer that exist both the prior & current year
prior_df = by_year_df.get_group(year-1)
# compute current and prior year revenue
current_year_revenue = year_df[year_df['customer_email'].isin(existing_customers)]['net_revenue'].sum()
prior_year_revenue = prior_df[prior_df['customer_email'].isin(existing_customers)]['net_revenue'].sum()
# append results to the lists
added_revenue_per_year.append((current_year_revenue, prior_year_revenue))
# compute customers per year
total_customers_per_year.append((len(current_year_customers),len(previous_year_customers)))
# compute numner of new and lost customer per year
new_customers_per_year.append(len(current_year_customers - existing_customers))
lost_customers_per_year.append(len(previous_year_customers - existing_customers))
import plotly.express as px
df = pd.DataFrame({'year': years[1:],
'Current Year Revenue': [x[0] for x in added_revenue_per_year],
'Prior Year Revenue': [x[1] for x in added_revenue_per_year]})
title = "Revenue by year for customers present in that year and prior years"
fig = px.bar(df, x='year', y=['Current Year Revenue', 'Prior Year Revenue'], title=title, barmode='group', labels={'y': 'Existing Customer Revenue (USD)', 'x': 'Year'})
fig.update_layout(yaxis_title='Revenue', xaxis=dict(tickmode='linear'))
fig.show()
import plotly.express as px
df = pd.DataFrame({'year': years[1:],
'Current Year Customers': [x[0] for x in total_customers_per_year],
'Prior Year Customers': [x[1] for x in total_customers_per_year]})
fig = px.bar(df, x='year', y=['Current Year Customers', 'Prior Year Customers'], title='Number of Customers by Year', barmode='group')
fig.update_layout(yaxis_title='Number of Customers', xaxis=dict(tickmode='linear'))
fig.show()
import plotly.express as px
df = pd.DataFrame({'year': years[1:],
'New Customers': new_customers_per_year,
'Lost Customers': lost_customers_per_year})
fig = px.bar(df, x='year', y=['New Customers', 'Lost Customers'], title='Number of New and Lost Customers by Year', barmode='group')
fig.update_layout(yaxis_title='Number of Customers', xaxis=dict(tickmode='linear'))
fig.show()
Annually, there is a significant influx and outflux of customers. In particular, the period from 2015 to 2016 saw a decrease in total revenue by 3.3 million, attributed to a decrease in customer numbers. Customers who made purchases in a given year and the preceding year contribute the same amount of revenue in each year. The loss of customers, or customer attrition, also significantly impacts revenue loss on a yearly basis.